%load_ext autoreload
%reload_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
import dataset_wrapper
import visualize
import parsing
import numpy as np
from sklearn.metrics import jaccard_similarity_score
%matplotlib inline
# make function handle i and o contour types
def batch_wrapper_keys(_linkfile_path, contour_type='o'):
    '''Collect matched dicom/contour path lists for one contour type.

    :param _linkfile_path: path to the link csv mapping dicom dirs to contour dirs
    :param contour_type: 'o' for outer contours or 'i' for inner contours
    :return: tuple (dicoms_paths, all_path2contours) of parallel lists, where
             dicoms_paths[k] is the dicom file matching contour all_path2contours[k]
    '''
    all_path2contours = []
    contour_files, _, pairs = dataset_wrapper.get_file_pairs(_linkfile_path)
    contour_paths = ['./final_data/contourfiles/' + x + '/' + contour_type + '-contours'
                     for x in contour_files]
    for _path in contour_paths:
        path2contours, contours_avail = dataset_wrapper.get_files_in_path(_path)
        all_path2contours.extend([path2contours + '/' + x for x in contours_avail])
    # parse each contour path once (the original called match_dicom_path twice per item)
    matches = [dataset_wrapper.match_dicom_path(x) for x in all_path2contours]
    dicoms_paths = [pairs[m[0]] + '/' + m[1] for m in matches]
    return dicoms_paths, all_path2contours
# build the (dicom, contour) path batches for outer and inner contours
_LINK_CSV = '/Users/mengningshang/Desktop/Dev_Env/medseg/final_data/link.csv'
o = batch_wrapper_keys(_LINK_CSV, contour_type='o')
i = batch_wrapper_keys(_LINK_CSV, contour_type='i')
obatch = o
ibatch = i
import sys
def match_contour_pairs(i, o):
    '''Pair inner and outer contours that belong to the same dicom.

    :param i: tuple (dicom_paths, contour_paths) for inner contours
    :param o: tuple (dicom_paths, contour_paths) for outer contours
    :return: tuple (i_pairs, o_pairs) — lists of (dicom_path, contour_path)
             tuples restricted to dicoms present in both inputs
    '''
    matching_dicoms = set(i[0]).intersection(o[0])
    # Return lists rather than one-shot zip iterators (the original returned
    # zip objects, which are exhausted after a single pass); the redundant
    # unzip/re-zip round trip and the debug print are also dropped.
    i_pairs = [p for p in zip(i[0], i[1]) if p[0] in matching_dicoms]
    o_pairs = [p for p in zip(o[0], o[1]) if p[0] in matching_dicoms]
    return i_pairs, o_pairs
i_f, o_f = match_contour_pairs(i, o)
# dicts keyed by dicom path for O(1) lookup of each contour file
odict = dict(o_f)
idict = dict(i_f)
print(list(idict.keys()))
def make_mask_sets(_odict=None, _idict=None, height=256, width=256):
    '''Build mask dictionaries keyed by dicom path.

    :param _odict: dicom_path -> o-contour file path (defaults to global odict)
    :param _idict: dicom_path -> i-contour file path (defaults to global idict)
    :param height: mask height in pixels (the original hard-coded 256)
    :param width: mask width in pixels (the original hard-coded 256)
    :return: tuple (rings, oms, ims, extras) where
             rings[x]: int mask of the muscle ring (outer minus inner),
             oms[x]/ims[x]: {'mask': int mask, 'file_path': last 3 path parts},
             extras[x]: bool mask of everything outside the o-contour
    '''
    _odict = odict if _odict is None else _odict
    _idict = idict if _idict is None else _idict
    rings = {}
    oms = {}
    ims = {}
    extras = {}
    for x in _odict.keys():
        om = parsing.poly_to_mask(parsing.parse_contour_file(_odict[x]), height, width)
        im = parsing.poly_to_mask(parsing.parse_contour_file(_idict[x]), height, width)
        # *1 converts bool masks to int so subtraction gives the ring region
        rings[x] = om * 1 - im * 1
        file_path = _odict[x].split('/')[-3:]
        oms[x] = {'mask': om * 1, 'file_path': file_path}
        ims[x] = {'mask': im * 1, 'file_path': file_path}
        extras[x] = np.logical_not(om * 1)  # background outside the o-contour
    return rings, oms, ims, extras
# materialize all mask sets once; reused by the plotting/scoring helpers below
rings, oms, ims, extras = make_mask_sets()
def select_region(img, _mask):
    '''
    Select pixel values from the image where the mask is nonzero.

    :param img: dict with 'pixel_data' holding a 2-D image
    :param _mask: dict with 'mask' holding a 2-D array of the same shape
    :return: flat list of pixel values at positions where mask != 0
    '''
    img = img['pixel_data']
    pixels = []
    for i in range(len(img)):
        # bug fix: the original used range(i), which only visited the
        # strictly lower-triangular part of the image; scan the full row
        for j in range(len(img[i])):
            if _mask['mask'][i][j] != 0:
                pixels.append(img[i][j])
    return pixels
# sanity check: number of dicoms that have both contours
len(rings)
key = 'SCD0000201/80.dcm'  # e.g. list(rings.keys())[11]
print(key)
dic = parsing.parse_dicom_file(dataset_wrapper.DATA_PATH + '/dicoms/' + key)
visualize.plot_overlay(dic, ims[key])  # quick visual check
def plot_densityHist(_key, ims=ims, oms=oms):
    '''
    Graphically compare the pixel-density histograms of the muscle ring
    vs the blood pool (inner contour) for one dicom.

    :param _key: dicom key, e.g. 'SCD0000101/219.dcm'
    :param ims: inner-contour mask dict (defaults to module-level ims)
    :param oms: outer-contour mask dict (unused; kept for interface compatibility)
    '''
    dicom = parsing.parse_dicom_file(dataset_wrapper.DATA_PATH + '/dicoms/' + _key)
    ring_crop = select_region(dicom, {'mask': rings[_key]})
    inner_crop = select_region(dicom, ims[_key])
    # the outer-contour crop was computed here but never used; dropped
    plt.hist(ring_crop, alpha=0.5, label='muscle')
    plt.hist(inner_crop, alpha=0.5, label='blood_pool')
    plt.legend(loc='upper right')
    plt.show()
# example: compare ring vs blood-pool densities for one study/slice
plot_densityHist('SCD0000101/219.dcm')
def iou_jaccard(tpix, ipix):
    '''Jaccard index (intersection over union) of two pixel-index sets.

    :param tpix: set of (row, col) pixel indices (prediction)
    :param ipix: set of (row, col) pixel indices (ground truth)
    :return: |intersection| / |union| as a float; 1.0 when both sets are
             empty (perfect agreement — also avoids ZeroDivisionError)
    '''
    intersection = tpix.intersection(ipix)
    union = tpix.union(ipix)
    if not union:
        return 1.0
    return len(intersection) / float(len(union))
def mask_to_pixel_index(_mask):
    '''Return the set of (row, col) indices where the mask is nonzero.

    :param _mask: 2-D array-like binary mask
    :return: set of (i, j) tuples for every nonzero entry
    '''
    pixels_index = set()
    for i in range(len(_mask)):
        # bug fix: the original used range(i), which only covered the
        # strictly lower-triangular part of the mask; scan the full row
        for j in range(len(_mask[i])):
            if _mask[i][j] != 0:
                pixels_index.add((i, j))
    return pixels_index
# sample key used for ad-hoc IOU experiments
k2 = 'SCD0000401/180.dcm'
def iou_score(key, bmask, _ims):
    '''IOU between a predicted binary mask and the gold i-contour mask.

    :param key: dicom key into _ims
    :param bmask: predicted 2-D binary mask
    :param _ims: dict of {'mask': ...} gold inner-contour masks
    :return: Jaccard IOU as a float
    '''
    # Dead code removed from the original: an unused dicom re-parse (file
    # I/O), pad_to = max(len(bmask), len(bmask)) (max of the same value,
    # never read), and a discarded duplicate mask_to_pixel_index call.
    tpix = mask_to_pixel_index(bmask)
    ipix = mask_to_pixel_index(_ims[key]['mask'])
    return iou_jaccard(tpix, ipix)
from skimage.filters import threshold_minimum,threshold_minimum, threshold_otsu, try_all_threshold
from PIL import Image, ImageDraw
from scipy.stats import ks_2samp
def visual_report(_key, white_background=True, plot=False):
    '''
    Generate diagnostic stats (and optionally plots) for a given dicom.

    Thresholds the region inside the o-contour with Otsu's method and
    compares the resulting blood-pool segmentation against the gold
    i-contour mask via a 2-sample KS test and an IOU score.

    :param _key: dicom key, e.g. 'SCD0000101/219.dcm'
    :param white_background: kept for interface compatibility — the original
        drew the whited-out panel unconditionally and then again under this
        flag, so the rendered output was the same either way
    :param plot: when True, draw the overlay and diagnostic panels
    :return: (_key, ks_result, iou, ks_result) — the KS result appears twice
             to preserve the original 4-tuple interface
    '''
    dicom = parsing.parse_dicom_file(dataset_wrapper.DATA_PATH + '/dicoms/' + _key)
    outter_crop = select_region(dicom, oms[_key])
    # Otsu threshold computed only from pixels inside the o-contour
    val = threshold_otsu(np.asarray(outter_crop))
    image_ = dicom['pixel_data']
    thresh_min = val
    binary_min = image_ * oms[_key]['mask'] > val
    binary_min_whiteoutExtra = binary_min + extras[_key]
    thresh_crop = select_region(dicom, {'mask': binary_min})
    inner_crop = select_region(dicom, ims[_key])
    if plot:
        visualize.plot_overlay(dicom, {'mask': binary_min}, _name=oms[_key]['file_path'])
        fig, ax = plt.subplots(2, 2, figsize=(10, 10))
        ax[0, 0].imshow(image_, cmap=plt.cm.gray)
        ax[0, 0].set_title('Original')
        ax[0, 1].hist(image_.ravel(), bins=256)
        ax[0, 1].set_title('Histogram')
        # drawn once (the original drew this panel twice when
        # white_background was True; the result is visually identical)
        ax[1, 0].imshow(binary_min_whiteoutExtra, cmap=plt.cm.gray)
        ax[1, 0].set_title('Thresholded (min)')
        ax[1, 0].imshow(ims[_key]['mask'], alpha=.4, cmap=visualize.CMAPBLUE)
        ax[1, 1].hist(image_.ravel(), bins=256)
        ax[1, 1].axvline(thresh_min, color='r')
        for a in ax[:, 0]:
            a.axis('off')
        plt.show()
    # compute the KS test and IOU once (the original recomputed the KS test
    # three times and the IOU twice across the print and return)
    ks_result = ks_2samp(inner_crop, thresh_crop)
    iou = iou_score(_key, binary_min, ims)
    print(ks_result, 'iou:', iou, ks_result)
    return _key, ks_result, iou, ks_result
# run the report once on a known-good example
visual_report('SCD0000101/219.dcm', plot=True)
ks_score_dict = {}
ious = []
ks_score = []
for k in rings.keys():
    plot_densityHist(k)
    # bug fix: visual_report returns a 4-tuple (key, ks, iou, ks); the
    # original 3-name unpack raised ValueError — absorb extras with *_
    k, ks2samp_score, iou_val, *_ = visual_report(k, plot=True)
    ks_score_dict[k] = ks2samp_score
    ious.append(iou_val)
    ks_score.append(ks2samp_score.pvalue)
ious
# fraction of dicom pairs whose threshold segmentation reaches IOU > .75
high = [x for x in ious if x > .75]
print(high)
print(len(high) / len(ks_score))
ks_score
# fraction with significant KS p-value (segmentation density differs from gold)
lowp = [x for x in ks_score if x < .05]
print(lowp)
print(len(lowp) / len(ks_score))
'''
p2.1
- most of the changes to pipeline code is presented in this notebook
- batch_wrapper is modified => batch_wrapper_keys to parse i and o contours
- helper functions were added to match i and o contours and make additional visualizations for exploratory purposes
- these changes are first made in jupyternotebook so i can test for backwards compatibility and unify the interface
i.e. many helper functions (visualize.plot_overlay) expects a dictionary object for contours,
this makes it ugly to have to call the helpers with a artifical dictonary object wrapped around the array values
which should be unified (helpers should check to see if object type is dict or array and handle accordingly)
- most of these functions can be re-integrated into the code base after more testing etc...
- some of the code is one-time use / for exploration, therefore it lives at global scope in the jupyter notebook; I therefore used a number of named parameters in the code for faster iterations
- these should really be used for "options" so that needs to get cleaned up when integration into the code-base
p2.2
- i plotted the distribution (position agnostic) of the pixels inside the i-contour vs between the i-contour and o-countours (muscle)
- usually there are 2 peaks in this distribution, since blood pool is darker
- i then check the distribution of the pixel density via 2 sample ks test using a random subset.
- this is a parametric test of "distance" between the 2 sampling distribution
- if pvalue is significant, we can reject the hypothesis that the distribution of pixel density in blood pool (bp) vs muscle is the same
- if pvalue is very small, it gives me more confidence that we can achieve some kind of reasonable segmentation based on threshold
- whether the threshold segments are sufficient depends on the business requirement
- but looking at the distribution histograms we commonly see overlaps which means I don't expect the threshold segmentation to be extremely accurate
- i then proceeded to compute a threshold value using one common method "threshold_otsu"
- I did try_all_threshold on a random subset and visually picked the best performing (method) from the try_all_threshold set
- I then computed a bloodpool segmentation mask based on the otsu threshold value (based on the region inside the outter contour mask)
- after applying the threshold I compared the IOU score for my own threshold based segmentation vs the gold labeled version
- custom implemented a IOU_score function that accounts for different sizes in the region
- I also plotted the result of each contour pair with IOU score and ks test pvalues from my segmentation results
- after inspecting the IOU scores, and ks-pvalues for all examples, I don't think this thereshold method performs well
- slightly more than 1/2 of the dicoms with both contours gave a IOU score of > .75 which is quite low for a medical setting
- furthermore the ks-pvalues are also very small in most cases (indicating that my segmentation's pixel density is very different from the ground truth distribution)
- I would expect low probability of generating high quality segments based on heuristics alone
- some other ideas I researched and wanted to try (with the expectation of achieving small improvements are):
- stretch density distribution to improve contrast inside o-contour
- canny edge detection inside the o-contour region
- flood-filling
- normalizing the images could improve segmentation results (similar to the first idea)
- try more "dynamic" ways of finding thresholds, (kmean is a clear option but that's ML)
'''